#!/usr/bin/python
# -*- coding: utf-8 -*-
""" $Id: htmldiff,v 1.62 2016/10/06 10:46:19 dom Exp $
"""

import atexit
import cgi
import http_auth
import httplib
import os
import re
import surbl
import sys
import tempfile
import tidy
import urlparse

from subprocess import Popen, PIPE

# Content-Type header value sent with every page generated by this script.
CONTENT_TYPE = "text/html;charset=utf-8"

# Opening part of the service page: doctype, <head>, banner and main heading.
# showPage() prints it first, then any error message, then Page2.
Page = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-US">
<head><title>HTML Diff service</title>
<link rel="stylesheet" href="http://www.w3.org/StyleSheets/base" />
</head>
<body>

<p><a href="http://www.w3.org/"><img src="http://www.w3.org/Icons/w3c_home" alt="W3C"/></a> <a href="http://www.w3.org/2003/Editors">W3C Editors homepage</a></p>

<h1>Create Diff between HTML pages</h1>
"""

# Closing part of the service page: the request form and the usage notes.
# It is a %-format template taking two values, in order: the (already
# HTML-escaped) address of the reference document and of the new document.
# Literal percent signs are therefore written as "%%".
# The paragraph introducing the list of markings is closed explicitly
# before <ul>, as required by the XHTML Strict doctype declared in Page.
Page2 = """
<form method="GET">
<p>Address of reference document: <input name="doc1" type="url" value="%s" style="width:100%%"/></p>
<p>Address of new document: <input name="doc2" value="%s"  style="width:100%%"/></p>
<p><input type="submit" value="get Diff"/></p>
</form>

<p><strong>Tip</strong>: if the document uses the W3C convention on linking to its previous version, you can specify only the address of the new document — the previous link will be automatically detected.</p>
<h2>Diff markings</h2>
<p>This service relies on <a href="https://www.gnu.org/software/diffutils/">GNU diff</a>. The found differences are roughly marked as follow:</p>
<ul>
<li>deleted text is shown in pink with down-arrows (as styled for a &lt;del> element)</li>
<li>where there is replacement, it’s shown in green with bi-directional arrows,</li>
<li>where there is newly inserted text, it’s yellow with up arrows (&lt;ins> element)</li>
</ul>
<address>
script $Revision: 1.62 $ of $Date: 2016/10/06 10:46:19 $<br />
by <a href="http://www.w3.org/People/Dom/">Dominique Hazaël-Massieux</a><br />based on <a href="https://dev.w3.org/cvsweb/2009/htmldiff/htmldiff.pl">Shane McCarron’ Perl script</a> wrapped in a <a href="http://dev.w3.org/cvsweb/2009/htmldiff/">Python CGI</a>
</address>
</body>
</html>
"""

def _refuse(content_type, message):
    """Send a 403 CGI response carrying the plain-text *message*, then exit."""
    sys.stdout.write("Status: 403\n")
    sys.stdout.write("Content-Type: %s\n" % content_type)
    sys.stdout.write("\n")
    sys.stdout.write("%s\n" % message)
    sys.exit()


def checkInputUrl(url):
    """Refuse to process *url* when it is local, scheme-less or SURBL-listed.

    On refusal a "403" CGI response is written to stdout and the process
    exits through sys.exit(); otherwise the function returns None.

    The scheme test is case-insensitive: the original prefix comparison
    only caught the exact lower-case spelling "file:", so "FILE:///..."
    slipped through.
    """
    scheme = urlparse.urlparse(url)[0].lower()
    # 'url' is refused as well because a "URL:" wrapper prefix in front of
    # "file:..." would otherwise hide the real scheme.
    # NOTE(review): assumes the opener strips such a prefix the way
    # urllib-style openers do -- confirm against http_auth.ProxyAuthURLopener.
    if (url[:5].lower() == 'file:' or scheme in ('file', 'url')
            or len(scheme) < 2):
        _refuse("text/plain", "sorry, I decline to handle file: addresses")

    # Only build the SURBL checker once the cheap scheme test has passed.
    checker = surbl.SurblChecker('/usr/local/share/surbl/two-level-tlds','/afs/w3.org/pub/WWW/Systems/Server/debian/generic/usr/local/etc/surbl.whitelist')
    if checker.isMarkedAsSpam(url):
        _refuse("text/plain; charset=utf-8",
                "sorry, this URL matches a record known in SURBL. See http://www.surbl.org/")

def copyHeader(copy_func, source, key, header_name=None):
    """Pass the value stored under *key* in *source* on to *copy_func*.

    *source* is any object offering ``get(key)``.  When the value is
    missing or empty nothing is copied and False is returned.  Otherwise
    ``copy_func(name, value)`` is called, where *name* is *header_name*
    if one was given and *key* if not, and True is returned.
    """
    value = source.get(key)
    if value:
        target_name = key if header_name is None else header_name
        copy_func(target_name, value)
        return True
    return False

def setupRequest(source_headers):
    """Create a URL opener carrying headers derived from the current request.

    An ``If-Modified-Since`` value found in *source_headers* is forwarded
    unchanged, and the ``REMOTE_ADDR`` environment variable is forwarded
    under the header name ``X_Forward_IP_Addr``.  Values that are missing
    or empty are skipped.  Returns the new http_auth.ProxyAuthURLopener.
    """
    opener = http_auth.ProxyAuthURLopener()
    # (where to look, key to read, header name to send; None keeps the key)
    forwarded = (
        (source_headers, 'If-Modified-Since', None),
        (os.environ, 'REMOTE_ADDR', 'X_Forward_IP_Addr'),
    )
    for source, key, header_name in forwarded:
        copyHeader(opener.addheader, source, key, header_name)
    return opener

def tidyFile(file):
    """Run tidy over an open document and return the cleaned-up copy.

    *file* is a readable file-like object; it is read to the end exactly
    once and always closed, even when reading fails.  (The parameter name
    shadows the builtin but is kept for compatibility with callers.)

    If the first 4096 characters contain an HTML5 doctype, tidy is run
    with the HTML5-oriented options from the start.  Otherwise it is run
    with the base options and, should that report errors, run a second
    time with the HTML5-oriented options.

    Returns ``(tidied, errors)``: *tidied* is a named temporary file
    holding tidy's output, rewound to the start and registered to be
    closed at interpreter exit; *errors* is the ``errors`` attribute of
    the last tidy run.
    """
    # Read once: avoids re-reading (and needing to seek) the input for the
    # doctype sniffing and for each tidy run.
    try:
        markup = file.read()
    finally:
        file.close()

    # option for tidy
    options = dict(tidy_mark=0,show_warnings=0,quiet=1,char_encoding='utf8')
    html5_options = {"add_xml_space": "no",
                     "output_xhtml": "no",
                     "tidy_mark": "no",
                     "new_blocklevel_tags": 'article,aside,canvas,dialog,details,figcaption,figure,footer,header,hgroup,menu,nav,section,main,summary,math,semantics,mrow,mfenced,mtable,mtr,mtd,mi,mn,msub,mo,mfrac,munderover,mtext,svg,g,image,rect,text,desc,line,path,polygon,ellipse,tspan,defs,feoffset,fecolormatrix,filter,fegaussianblur,feblend,marker,circle',
                     "new_inline_tags": 'video,audio,canvas,ruby,rt,rp,time,meter,progress,track,source,emu-val,emu-nt,emu-t,mark',
                     "break_before_br": "no",
                     "vertical_space": "no",
                     "enclose_text": "no",
                     "numeric_entities": "yes",
                     "wrap": "1000",
                     "wrap_attributes": "no",
                     "drop_empty_paras": "no"
                     }
    html5 = re.search(r"<!doctype\s+html\s*>", markup[:4096],
                      re.IGNORECASE)
    if html5:
        options.update(html5_options)
    newtidy = tidy.parseString(markup, **options)
    if len(newtidy.errors) > 0 and not html5:
        # Second chance: the document may use HTML5 elements without
        # carrying the HTML5 doctype.
        options.update(html5_options)
        newtidy = tidy.parseString(markup, **options)

    tidied = tempfile.NamedTemporaryFile(
        mode='w+', prefix='htmldiff-', suffix='.html')
    atexit.register(tidied.close)
    tidied.write(str(newtidy))
    tidied.flush()
    tidied.seek(0)
    return (tidied, newtidy.errors)

def matchPredecessorRel(rel):
    """Tell whether a ``rel`` attribute value names ``predecessor-version``.

    *rel* is the attribute value as a string, or None when the attribute
    is absent.  The value is treated as a list of tokens separated by any
    whitespace (spaces, tabs, newlines) and compared case-insensitively.
    Always returns a bool.
    """
    return bool(rel) and "predecessor-version" in rel.lower().split()

def mirrorURL(url, opener):
    """Download *url* with *opener* and return its tidied content.

    Returns ``(file, headers)`` on success, where *file* is the tidied
    document as returned by tidyFile() and *headers* are the response
    headers returned by ``opener.retrieve``.  On any failure returns
    ``(None, {})`` and leaves a description in ``opener.error``.

    The downloaded file is registered for deletion at interpreter exit.
    """
    try:
        filename, headers = opener.retrieve(url)
    except IOError as error:
        opener.error = "I/O error: %s %s" % (error.errno, error.strerror)
    except httplib.InvalidURL:
        opener.error = "Invalid URL submitted"
    except AttributeError:  # ProxyAuthURLopener returned None.
        pass                # There's already an error set.
    else:
        atexit.register(os.unlink, filename)
        # Binary mode: the payload may be gzip-compressed data.
        document = open(filename, 'rb')
        # Content-coding names are case-insensitive and "x-gzip" is an
        # alias of "gzip".
        coding = (headers.get("content-encoding") or "").strip().lower()
        if coding in ("gzip", "x-gzip"):
            import gzip
            from StringIO import StringIO
            data = StringIO(document.read())
            document.close()
            document = gzip.GzipFile(fileobj=data)
        document, errors = tidyFile(document)
        if len(errors) == 0:
            return (document, headers)
        opener.error = "Tidy errors: %s" % (str(errors))
    return (None, {})

def showPage(url1='', url2='', error_html='', **headers):
    for name, value in headers.items():
        print "%s: %s" % (name.replace('_', '-'), value)
    print
    print Page
    print error_html
    print Page2 % (url1, url2)
    sys.exit()

def serveRequest():
    fields = cgi.FieldStorage()

    if (not fields.has_key('doc2')):
        showPage(Content_Type=CONTENT_TYPE)
    # if doc1 is not specified, we load doc2 to check if it has a previous version link
    doc2 = fields['doc2'].value
    checkInputUrl(doc2)
    url_opener2 = setupRequest(fields.headers)
    newdoc, newheaders = mirrorURL(doc2, url_opener2)
    if fields.has_key('doc1'):
        doc1 = fields['doc1'].value
    elif newdoc is not None:
        from BeautifulSoup import BeautifulSoup

        soup = BeautifulSoup(newdoc.read())
        newdoc.seek(0)
        try:
            doc1 = soup.find(text=re.compile("Previous Version",re.IGNORECASE)).findNext(name="a", attrs={"href":True})["href"]
        except:
            try:
                doc1 = soup.find(name=["a", "link"], attrs={"href":True, rel:matchPredecessorRel})["href"]
            except:
                doc1 = None
    else:
        doc1 = None
    if (not doc1):
        showPage(Content_Type=CONTENT_TYPE)

    checkInputUrl(doc1)
    esc1 = cgi.escape(doc1, True)
    esc2 = cgi.escape(doc2, True)
    urlcomponents1 = urlparse.urlparse(doc1)
    urlcomponents2 = urlparse.urlparse(doc2)
    # if same domain, we can use the same urlopener
    # otherwise, we create a separate one
    if urlcomponents2[1] == urlcomponents1[1]:
        url_opener = url_opener2
    else:
        url_opener = setupRequest(fields.headers)

    refdoc, refheaders = mirrorURL(doc1, url_opener)
    if not (refdoc and newdoc):
        http_error = ""
        url = ""
        if not refdoc:
            http_error = url_opener.error
            url = esc1
        else:
            http_error = url_opener2.error
            url = esc2
        if re.match("^[1234][0-9][0-9] ", http_error):
            print "Status: %s" %(http_error)
        error="<p style='color:#FF0000'>An error (%s) occured trying to get <a href='%s'>%s</a>.</p>" % (cgi.escape(http_error), url, url)
        showPage(esc1, esc2, error, Content_Type=CONTENT_TYPE)

    print "Content-Type: text/html"
    if newheaders.has_key('Content-Type'):
        contentType = cgi.parse_header(newheaders["Content-Type"])
        if contentType[1].has_key('charset'):
            charset = contentType[1]['charset'].lower()
            #if charset == "iso-8859-1":
            #    options["char_encoding"]='latin1'

    for proxy_header in ('Last-Modified', 'Expires'):
        if copyHeader(lambda header, value: sys.stdout.write("%s: %s" %(header, value)), newheaders, proxy_header):
            print
    print
    p = Popen(["/usr/local/bin/htmldiff", refdoc.name, newdoc.name],
              stdin=PIPE, stdout=PIPE, stderr=PIPE)
    sys.stdout.flush()
    sys.stderr.flush()
    (out, err) = p.communicate()
    p.stdin.close()
    if err:
        error = "<p style='color:#FF0000'>An error occured when running <code>htmldiff</code> on the documents:</p><pre>%s</pre>" % (cgi.escape(err),)
        showPage(esc1, esc2, error)
    else:
        print out
# Serve a request only when run as a script with SCRIPT_NAME present in the
# environment; otherwise do nothing.
if __name__ == '__main__' and 'SCRIPT_NAME' in os.environ:
    serveRequest()
